#!/usr/bin/python
# -*- coding: utf-8 -*-

from mongodb import mongodb
import re
import time

class ModifyWrongField:
    """Maintenance routines that repair fields of documents in ``newsv1``.

    The ``*Field`` / ``removeNoise`` methods walk every document returned by
    ``self.db.newsv1.find()`` and write corrected values back with ``$set``.
    The remaining methods are pure string helpers and need no database.
    """

    # (URL fragment, site name) pairs, checked in order; the first fragment
    # found in the URL wins, so 'um.auto.sina.com' must precede 'sina.com.cn'.
    _SITE_NAMES = (
        ('sohu.com', '搜狐汽车'),
        ('um.auto.sina.com', '新浪车致'),
        ('sina.com.cn', '新浪汽车'),
        ('xcar.com.cn', '爱卡汽车'),
        ('yiqishuo.yiche.com', '易车说客'),
        ('bitauto.com', '易车'),
        ('pcauto', '太平洋汽车'),
        ('autohome.com.cn', '汽车之家'),
    )

    # Patterns compiled once instead of on every call.
    _CHINESE_CHAR = re.compile(u"[\u4e00-\u9fa5]")
    _WHITESPACE_RUN = re.compile(r'\s+')
    _HTML_TAG = re.compile(r'<.*?>')

    def __init__ (self):
        """Obtain the database handle from the project's ``mongodb`` helper."""
        dbcl = mongodb()
        # NOTE(review): presumably a pymongo database object -- confirm in mongodb.py.
        self.db = dbcl.get_db()

    def modifySiteNameField (self):
        """Recompute ``site_name`` from ``url`` for every document in ``newsv1``."""
        rsts = self.db.newsv1.find()

        print(rsts.count())

        count = 0
        for rst in rsts:
            siteName = self.modifySiteName(rst['url'])
            self.db.newsv1.update({'_id': rst['_id']}, {'$set':{'site_name':siteName}})

            # Progress is reported before the increment, so the first line says 0.
            if count % 1000 == 0:
                print("Processed "+ str(count)+" news")
            count += 1

    def removePTagField(self):
        """Strip whitespace and stray ``</p>`` from ``title`` and ``author``.

        A falsy title is stored as ``''``; a falsy author is written back
        unchanged.
        """
        rsts = self.db.newsv1.find()
        print(rsts.count())

        count = 0
        for rst in rsts:
            title = rst['title']
            title = self._stripClosingP(title) if title else ''

            author = rst['author']
            if author:
                author = self._stripClosingP(author)

            self.db.newsv1.update({'_id': rst['_id']}, {'$set':{'title':title, 'author': author}})
            count += 1

            if count % 1000 == 0:
                print('process records ' + str(count))

    def removeNoise(self):
        """Remove leftover HTML tags and other noise from every ``content`` field.

        Documents whose content is empty or missing a value are skipped and
        are not counted in the progress output.
        """
        rsts = self.db.newsv1.find()
        print(rsts.count())

        count = 0
        for rst in rsts:
            content = rst['content']
            if not content:
                continue

            cleaned = self.cleanText(content)
            self.db.newsv1.update({'_id': rst['_id']}, {'$set': {'content':cleaned}})
            count += 1

            if count % 1000 == 0:
                print('process records ' + str(count))

    def string2Timestamp(self, TimeString):
        """Convert a date/time string to a local-time POSIX timestamp.

        Accepts ``YYYY-MM-DD[ HH[:MM[:SS]]]``; Chinese date characters are
        treated as ``-`` separators. Missing time components are filled
        with zeros.

        Returns:
            ``0`` for an empty or ``None`` input, otherwise the float
            produced by ``time.mktime``.

        Raises:
            ValueError: if the normalised string does not match
                ``%Y-%m-%d %H:%M:%S``.
        """
        # Truthiness also covers None, which used to crash in len().
        if not TimeString:
            return 0

        # Replace every Chinese character with '-', then drop the last '-'
        # (the one produced by the trailing day marker).
        if self._CHINESE_CHAR.search(TimeString):
            timeTemp = self._CHINESE_CHAR.sub('-', TimeString)
            lastDash = timeTemp.rfind('-')
            timeTemp = timeTemp[:lastDash] + timeTemp[lastDash + 1:]
        else:
            timeTemp = TimeString

        # Date only: add an hour field so the padding below yields HH:MM:SS.
        if ':' not in timeTemp and len(timeTemp.split(" ")) == 1:
            timeTemp += " 00"

        # Pad missing minute/second fields.
        while timeTemp.count(':') < 2:
            timeTemp += ':00'

        return time.mktime(time.strptime(timeTemp, "%Y-%m-%d %H:%M:%S"))

    def isfloat (self,publishTime):
        """Return 1 if ``publishTime`` can be converted by ``float()``, else 0."""
        try:
            # Fixed: the original referenced the undefined name ``publishtime``,
            # so the bare except made this method always return 0.
            float(publishTime)
        except (TypeError, ValueError):
            return 0
        return 1

    def modifySiteName(self, url):
        """Map a news URL to its site name; return ``''`` when nothing matches."""
        if not url:
            return ''

        # Plain containment for every site; the original used ``> 0`` for
        # sohu.com only, which missed URLs starting with that fragment.
        for fragment, siteName in self._SITE_NAMES:
            if fragment in url:
                return siteName

        return ''

    def modifyTitle(self, title):
        """Return ``title`` stripped of surrounding whitespace and ``</p>``."""
        # Fixed: the original discarded the result of ``replace``.
        return self._stripClosingP(title)

    def cleanText(self, text):
        """Collapse whitespace runs to one space, then remove HTML tags.

        More aggressive filters (tables, images, layout tags, blank lines,
        image captions, bracketed site prefixes, bullet symbols) were
        disabled in the earlier version and are not applied here.
        """
        # Collapsing first turns newlines into spaces, so a tag split across
        # lines is still matched by the non-greedy tag pattern.
        text = self._WHITESPACE_RUN.sub(' ', text)

        # Fixed: the original built the tag-free string but returned the
        # untouched ``text``.
        return self._HTML_TAG.sub('', text)

    def _stripClosingP(self, value):
        """Trim surrounding whitespace, then delete every ``</p>`` in ``value``."""
        return value.strip().replace('</p>', '')


if __name__ == "__main__":
    # Script entry point: run the content-cleaning pass over the collection.
    handler = ModifyWrongField()
    handler.removeNoise()

